Work in progress: a completed, but unpolished, version of this work is available here.
How: Construct a model that predicts whether an individual makes more than 50k/yr, a threshold associated with being a likely candidate for donating.
Data Source: 1994 US Census data, via the UCI Machine Learning Repository.
Note: Dataset donated by Ron Kohavi and Barry Becker, from the article "Scaling Up the Accuracy of Naive-Bayes Classifiers: A Decision-Tree Hybrid". Small changes to the dataset have been made, such as removing the 'fnlwgt' feature and records with missing or ill-formatted entries.
1.1 Data Dictionary
1.2 Simple Cleaning
1.3 Summary Statistics
1.4 Distributions
1.5 Skew and Variance
1.6 Relationships
2.1 Separate Labels from Factors
2.2 Transformation
2.2.1 Indicator Variables
2.2.2 Logarithmic Transform
2.2.3 Impact
2.2.4 Normalization and Standardization
3. Metrics
3.1 Accuracy
3.2 Precision
3.3 Recall
3.4 F$\beta$-Score
4. Models
4.1 Selection
4.2.1 Application
4.3 Model Application Pipeline
4.4.1 Application
4.4.2 Tuning
4.5 Random Forest
4.5.1 Application
4.5.2 Tuning
4.6 Ada Boost
4.6.1 Application
4.6.2 Tuning
4.7 Comparison
4.7.1 Feature Importance
4.7.2 Selection
5. Summary
import numpy as np # Library for numerical computing with Python
import pandas as pd # Library to work with data in tabular form and the like
from time import time # Package to work with time values
from IPython.display import display # Allows the use of display() for DataFrames
import matplotlib.pyplot as plt # Package for plotting
import seaborn as sns # Library for plotting, prettier than matplotlib
import visuals as vs # Adapted from Udacity
import plotly.graph_objects as go # Interactive plots
import plotly.express as px # Interactive plots
from plotly.subplots import make_subplots # Interactive plots
from dython.nominal import associations # Categorical plots
import statsmodels.api as sm # Statistical analysis toolbox
from scipy.stats import skew # Tool to evaluate statistical measure
from sklearn.preprocessing import MinMaxScaler # Feature scaling tool
from sklearn.model_selection import train_test_split, GridSearchCV # Data splitting and tuning
from sklearn.naive_bayes import MultinomialNB # Naive Bayes Classifier model
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV # Logistic Regression model
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier # Ensemble models
from sklearn.metrics import fbeta_score, accuracy_score, make_scorer # Model metrics
# iPython Notebook formatting
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Account for changes made to imported packages
%load_ext autoreload
%autoreload 2
data = pd.read_csv("census.csv")
Standardizing factor names according to PEP 8 naming conventions can be helpful.
There are a number of categorical variables. Handling those with one-hot encoding can be helpful.
name_changes = {x: x.lower().replace("-", "_") for x in data.columns.tolist() if "-" in x}
data = data.rename(columns=name_changes)
data.info(null_counts=True) # Show information for each factor: NaN counts and data-type of column
data.describe(include='all').T # Summarize each factor, transpose the summary
n_records = data.shape[0] # First element of .shape indicates n
n_greater_50k = data[data['income'] == '>50K'].shape[0] # n of those with income > 50k
n_at_most_50k = data.where(data['income'] == '<=50K').dropna().shape[0] # .where() requires dropping na for this
greater_percent = round((n_greater_50k / n_records)*100,2) # Show proportion of > 50k to whole data
data_details = {"Number of observations": n_records,
"Number of people with income > 50k": n_greater_50k,
"Number of people with income <= 50k": n_at_most_50k,
"Percent of people with income > 50k": greater_percent} # Cache values of analysis
for item in data_details: # Iterate through the cache
print("{0}: {1}".format(item, data_details[item])) # Print the values
fig = px.histogram(data, x="income", nbins=2)
fig.update_layout(height=600, width=750,
title_text="Distribution of Income",
showlegend=False)
fig.update_yaxes(title_text="Number of Records")
fig.show()
fig = px.histogram(data, x="age", nbins=data['age'].nunique(), color='income', opacity=0.75)
fig.update_layout(height=600, width=750,
title_text="Distribution of Age",
showlegend=True)
fig.update_yaxes(title_text="Number of Records")
fig.update_xaxes(title_text="Age")
fig.show()
fig = px.histogram(data, x="workclass", color='income', opacity=0.75)
fig.update_layout(height=600, width=750,
title_text="Distribution of Workclass",
showlegend=True)
fig.update_yaxes(title_text="Number of Records")
fig.update_xaxes(title_text="Classification of Workclass", )
fig.show()
fig = px.histogram(data, x="education_level", color='income', opacity=0.75)
fig.update_layout(height=600, width=750,
title_text="Distribution of Education",
showlegend=True)
fig.update_yaxes(title_text="Number of Records")
fig.update_xaxes(title_text="Classification of Education", )
fig.show()
fig = px.histogram(data, x="marital_status", color='income', opacity=0.75)
fig.update_layout(height=600, width=750,
title_text="Distribution of Marital-Status",
showlegend=True)
fig.update_yaxes(title_text="Number of Records")
fig.update_xaxes(title_text="Classification of Marital-Status")
fig.show()
fig = px.histogram(data, x="occupation", color='income', opacity=0.75)
fig.update_layout(height=600, width=750,
title_text="Distribution of Occupation",
showlegend=True)
fig.update_yaxes(title_text="Number of Records")
fig.update_xaxes(title_text="Classification of Occupation", )
fig.show()
fig = px.histogram(data, x="relationship", color='income', opacity=0.75)
fig.update_layout(height=600, width=750,
title_text="Distribution of Relationship",
showlegend=True)
fig.update_yaxes(title_text="Number of Records")
fig.update_xaxes(title_text="Classification of Relationship", )
fig.show()
fig = px.histogram(data, x="race", color='income', opacity=0.75)
fig.update_layout(height=600, width=750,
title_text="Distribution of Race",
showlegend=True)
fig.update_yaxes(title_text="Number of Records")
fig.update_xaxes(title_text="Classification of Race", )
fig.show()
fig = px.histogram(data, x="sex", color='income', opacity=0.75)
fig.update_layout(height=600, width=750,
title_text="Distribution of Sex",
showlegend=True)
fig.update_yaxes(title_text="Number of Records")
fig.update_xaxes(title_text="Classification of Sex", )
fig.show()
fig = px.histogram(data, x="hours_per_week", color='income', nbins=10, opacity=0.75)
fig.update_layout(height=600, width=750,
title_text="Distribution of Hours-per-Week",
showlegend=True)
fig.update_yaxes(title_text="Number of Records")
fig.update_xaxes(title_text="Hours Worked in a Week")
fig.show()
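The histogram cells above repeat the same plotting pattern; a small helper along these lines (an illustrative sketch only, the function name income_histogram is hypothetical and it is not used elsewhere in this notebook) could generate each of them:
def income_histogram(df, column, title, xaxis_title, nbins=None):
    """Illustrative helper: histogram of one factor, colored by income."""
    fig = px.histogram(df, x=column, color='income', nbins=nbins, opacity=0.75)
    fig.update_layout(height=600, width=750, title_text=title, showlegend=True)
    fig.update_yaxes(title_text="Number of Records")
    fig.update_xaxes(title_text=xaxis_title)
    return fig
# Example usage:
# income_histogram(data, "occupation", "Distribution of Occupation", "Classification of Occupation").show()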
sns.set_context("paper", rc={"font.size":16,
"axes.titlesize":16,
"axes.labelsize":16,
"lines.linewidth": 2.5,
"legend.fontsize":12})
sns.pairplot(data[['income', 'age', 'education_num', 'hours_per_week']],
kind="reg",
hue='income',
height=4,
plot_kws=dict(scatter_kws=dict(s=9)))
plt.show()
The features capital_gain and capital_loss are positively skewed (i.e. have a long tail in the positive direction).
To reduce this skew, a logarithmic transformation, $\tilde x = \ln\left(x\right)$, can be applied. This transformation will reduce the amount of variance and pull the mean closer to the center of the distribution.
Why does this matter: The extreme points may affect the performance of the predictive model.
Why care: We want an easily discernible relationship between the independent and dependent variables; the skew makes that more complicated.
Why this might NOT matter: The distribution of the independent variables is not an assumption of most models. Linear regression, for example, assumes a zero conditional mean of the residuals, $E\left(u \mid x\right) = 0$ where $u = Y - \hat{Y}$, and homoskedasticity of those residuals, rather than any particular distribution of the independent variables themselves. In this analysis, the dependent variable is categorical (i.e. discrete, not continuous), so linear regression is not an appropriate model in any case.
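As a quick, self-contained illustration of what the transformation does (synthetic numbers drawn here for demonstration, not the census data):
# Synthetic demonstration: skewness before and after a log(x + 1) transform
rng = np.random.default_rng(0)
sample = rng.lognormal(mean=0, sigma=1.5, size=10000)  # heavily right-skewed draw
print("Skew before:", round(skew(sample), 3))
print("Skew after: ", round(skew(np.log(sample + 1)), 3))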
cap_loss = data['capital_loss']
cap_gain = data['capital_gain']
cap_loss_skew, cap_loss_var, cap_loss_mean = skew(cap_loss), np.var(cap_loss), np.mean(cap_loss)
cap_gain_skew, cap_gain_var, cap_gain_mean = skew(cap_gain), np.var(cap_gain), np.mean(cap_gain)
fac_df = pd.DataFrame({'Feature': ['Capital Loss', 'Capital Gain'],
'Skewness': [cap_loss_skew, cap_gain_skew],
'Mean': [cap_loss_mean, cap_gain_mean],
'Variance': [cap_loss_var, cap_gain_var]})
display(fac_df)
fig = make_subplots(rows=2, cols=1)
fig.update_layout(height=800, width=850,
title_text="Skewed Distributions of Continuous Census Data Features",
showlegend=False
)
fig.add_trace(
go.Histogram(x=data['capital_loss'], nbinsx=25,
name='Capital-Loss'),
row=1, col=1
)
fig.add_trace(
go.Histogram(x=data['capital_gain'], nbinsx=25,
name='Capital-Gain'),
row=2, col=1
)
fig.update_xaxes(title_text="Capital-Loss Feature Distribution", row=1, col=1)
fig.update_xaxes(title_text="Capital-Gain Feature Distribution", row=2, col=1)
for i in range(1,3): # Two subplot rows in this figure
fig.update_yaxes(title_text="Number of Records", range=[0, 2000],
patch = dict(
tickmode = 'array',
tickvals = [0, 500, 1000, 1500, 2000],
ticktext = [0, 500, 1000, 1500, ">2000"]),
row=i, col=1)
fig.show()
Toward determining which factors should be included in the model, there is something to note with regard to categorical versus continuous variables.
Correlation is defined as: $$r = \frac{\sum\left(X-\bar{X}\right)\cdot\left(Y-\bar{Y}\right)}{\sqrt{(\sum\left(X-\bar{X}\right)^{2})}\cdot\sqrt{\sum\left(Y-\bar{Y}\right)^{2}}}$$
This is not well-defined for categorical variables. Instead, it can be useful to use the uncertainty coefficient, also known as Theil's U.
Where we have entropy of a single distribution:
$$H\left(X\right)=-\sum_{x} P_{X}\left(x\right)\log P_{X}\left(x\right)$$Conditional entropy as:
$$H\left(X|Y\right) = - \sum_{x,y} P_{X,Y}\left(x,y\right)\log P_{X|Y}\left(x|y\right)$$and the uncertainty coefficient as:
$$U\left(X|Y\right)=\frac{H\left(X\right)-H\left(X|Y\right)}{H\left(X\right)} = \frac{I\left(X;Y\right)}{H\left(X\right)}$$Where $I\left(X;Y\right)$ is the mutual information, or the amount of information obtained about one random variable through observing the other random variable.
To quote Shaked Zychlinski, "given the value of x, how many possible states does y have, and how often do they occur".
So, can this help us discern what to do with our factors?
I will proceed with the idea that collinearity, where one variable can easily be derived from another within the model, is not desired (i.e. two variables with a strong relationship to one another should not both be included, as they may reduce the predictive power of the model).
Citation: Shaked Zychlinski
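To make the definition concrete before handing the computation to dython, here is a minimal from-scratch sketch of the uncertainty coefficient (illustrative only; conditional_entropy and theils_u are hypothetical helpers written for this note, while dython's implementation is what is actually used below):
from collections import Counter  # Frequency counts of categorical values

def conditional_entropy(x, y):
    """H(X|Y) for two same-length sequences of categorical values."""
    y_counts = Counter(y)
    xy_counts = Counter(zip(x, y))
    total = len(x)
    h = 0.0
    for (x_val, y_val), n_xy in xy_counts.items():
        p_xy = n_xy / total              # P(x, y)
        p_y = y_counts[y_val] / total    # P(y)
        h += p_xy * np.log(p_y / p_xy)   # equals -P(x, y) * log P(x | y)
    return h

def theils_u(x, y):
    """U(X|Y) = (H(X) - H(X|Y)) / H(X), bounded in [0, 1]."""
    counts = Counter(x)
    total = len(x)
    h_x = -sum((n / total) * np.log(n / total) for n in counts.values())
    return 1.0 if h_x == 0 else (h_x - conditional_entropy(x, y)) / h_x

# e.g. theils_u(data['marital_status'], data['income'])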
A model should avoid including pairs of factors that are strongly associated with one another. From the association matrix below:
- age and marital_status (0.56): age & income is 0.24, marital_status & income is 0.20, so drop marital_status
- age and relationship (0.46): age & income is 0.24, relationship & income is 0.21, so drop relationship
- education_num and occupation (0.57): education_num & income is 0.33, occupation & income is 0.11, so drop occupation
- marital_status and relationship (0.49): already determined that marital_status and relationship would be dropped from the model
associations(dataset=data, mark_columns=True, theil_u=True, figsize=(15,15), cmap='coolwarm')
plt.show()
2.1 Separate Labels from Factors
2.2 Transformation
2.2.1 Indicator Variables
2.2.2 Logarithmic Transform
2.2.3 Impact
2.2.4 Normalization and Standardization
For training an algorithm, it is useful to separate the label, or dependent variable ($Y$), from the features, or independent variables ($X$).
Y = data['income']
X = data.drop(['income'], axis=1)
A common way to handle categorical variables is to make indicator, or dummy, variables from the values of the factors.
Pandas has a simple function, pd.get_dummies(), that can perform this very quickly.
Note that this will create a new variable for every value a categorical variable takes, as demonstrated in this example:
|   | someFeature |   | someFeature_A | someFeature_B | someFeature_C |
|---|---|---|---|---|---|
| 0 | B |   | 0 | 1 | 0 |
| 1 | C | ----> one-hot encode ----> | 0 | 0 | 1 |
| 2 | A |   | 1 | 0 | 0 |
This means that $p$, the number of factors, will grow, and potentially by a large amount. Specifically, if $p$ is the original number of factors, $c$ is the number of categorical variables, and $k_{j}$ is the number of distinct categories taken by the $j$-th categorical variable, then the number of factors after creating indicator variables is:
$$p_{I} = p - c + \sum_{j=1}^{c} k_{j}$$
It is also worth noting that for modeling, it is important that one value of each factor, a "base case", be dropped from the data. This is because the base case is redundant, i.e. it can be inferred perfectly from the other cases, and, more detrimental to our model, it leads to multicollinearity of the terms.
In some models (e.g. logistic regression, linear regression), an assumption of no multicollinearity must hold.
So, the final number of factors after creating indicator variables and dropping the base cases is: $$\tilde{p}=p_{I} - c$$
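A tiny, throwaway check of that count (the toy frame below is purely illustrative, not the census data):
# Toy illustration: p = 3 factors, c = 2 categorical, distinct categories k = (2, 3)
toy = pd.DataFrame({'age': [25, 40, 33],                        # numerical, passes through
                    'sex': ['F', 'M', 'F'],                     # 2 categories
                    'workclass': ['Private', 'Gov', 'Self']})   # 3 categories
toy_encoded = pd.get_dummies(toy, drop_first=True)
# Expected: p_tilde = p - 2c + sum(k_j) = 3 - 4 + 5 = 4 columns
print(toy_encoded.columns.tolist())  # ['age', 'sex_M', 'workclass_Private', 'workclass_Self']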
factors = ['age', 'workclass', 'education_level', 'education_num', 'marital_status',
'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
'hours_per_week', 'native_country']
unencoded = len(list(X.columns))
X = pd.get_dummies(X[factors], drop_first=True) # Create dummies, dropping the base case
Y = (Y == '>50K').astype(int) # Encode the label: 1 if income > 50k, else 0
encoded = len(list(X.columns))
print("{} total features before one-hot encoding.".format(unencoded))
print("{} total features after one-hot encoding.".format(encoded))
To reduce skew, a logarithmic transformation, $\tilde x = \ln\left(x + 1\right)$ (the +1 accommodates the many zero-valued records), can be applied. This transformation will reduce the amount of variance and pull the mean closer to the center of the distribution.
The logarithmic transformation reduced the skew and the variance of each factor.
| Feature | Skewness | Mean | Variance |
|---|---|---|---|
| Capital Loss | 4.516154 | 88.595418 | 163985.81018 |
| Capital Gain | 11.788611 | 1101.430344 | 56345246.60482 |
| Log Capital Loss | 4.271053 | 0.355489 | 2.54688 |
| Log Capital Gain | 3.082284 | 0.740759 | 6.08362 |
skewed = ['capital_gain', 'capital_loss']
X_log_transformed = pd.DataFrame(data=X).copy()
X_log_transformed[skewed] = X[skewed].apply(lambda x : np.log(x + 1))
fig = make_subplots(rows=2, cols=1)
fig.update_layout(height=800, width=850,
title_text="Skewed Distributions of Continuous Census Data Features",
showlegend=False)
fig.add_trace(
go.Histogram(x=X_log_transformed['capital_loss'], nbinsx=25,
name='Log of Capital-Loss'),
row=1, col=1)
fig.add_trace(
go.Histogram(x=X_log_transformed['capital_gain'], nbinsx=25,
name='Log of Capital-Gain'),
row=2, col=1)
fig.update_xaxes(title_text="Log of Capital-Loss Feature Distribution", row=1, col=1)
fig.update_xaxes(title_text="Log of Capital-Gain Feature Distribution", row=2, col=1)
for i in range(1,3):
fig.update_yaxes(title_text="Number of Records", range=[0, 2000],
patch = dict(
tickmode = 'array',
tickvals = [0, 500, 1000, 1500, 2000],
ticktext = [0, 500, 1000, 1500, ">2000"]),
row=i, col=1)
fig.show()
log_cap_loss_skew = skew(X_log_transformed['capital_loss'])
log_cap_loss_var = round(np.var(X_log_transformed['capital_loss']),5)
log_cap_loss_mean = np.mean(X_log_transformed['capital_loss'])
log_cap_gain_skew = skew(X_log_transformed['capital_gain'])
log_cap_gain_var = round(float(np.var(X_log_transformed['capital_gain'])),5)
log_cap_gain_mean = np.mean(X_log_transformed['capital_gain'])
log_fac_df = pd.DataFrame({'Feature': ['Log Capital Loss', 'Log Capital Gain'],
'Skewness': [log_cap_loss_skew, log_cap_gain_skew],
'Mean': [log_cap_loss_mean, log_cap_gain_mean],
'Variance': [log_cap_loss_var, log_cap_gain_var]})
fac_df = pd.concat([fac_df, log_fac_df], ignore_index=True) # pd.concat, as DataFrame.append is deprecated
fac_df['Variance'] = fac_df['Variance'].apply(lambda x: '%.5f' % x)
display(fac_df)
fig = make_subplots(rows=4, cols=1)
fig.update_layout(height=800, width=850,
title_text="Comparison of Distributions of Continuous Census Data Features",
showlegend=False
)
fig.add_trace(
go.Histogram(x=X['capital_loss'], nbinsx=25,
name='Capital-Loss'),
row=1, col=1
)
fig.add_trace(
go.Histogram(x=X_log_transformed['capital_loss'], nbinsx=25,
name='Log of Capital-Loss'),
row=2, col=1
)
fig.add_trace(
go.Histogram(x=X['capital_gain'], nbinsx=25,
name='Capital-Gain'),
row=3, col=1
)
fig.add_trace(
go.Histogram(x=X_log_transformed['capital_gain'], nbinsx=25,
name='Log of Capital-Gain'),
row=4, col=1
)
fig.update_xaxes(title_text="Capital-Loss Feature Distribution", row=1, col=1)
fig.update_xaxes(title_text="Log of Capital-Loss Feature Distribution", row=2, col=1)
fig.update_xaxes(title_text="Capital-Gain Feature Distribution", row=3, col=1)
fig.update_xaxes(title_text="Log of Capital-Gain Feature Distribution", row=4, col=1)
for i in range(1,5):
fig.update_yaxes(title_text="Number of Records", range=[0, 2000],
patch = dict(
tickmode = 'array',
tickvals = [0, 500, 1000, 1500, 2000],
ticktext = [0, 500, 1000, 1500, ">2000"]),
row=i, col=1)
fig.show()
Originally, the influence of capital_loss on income was statistically significant, but after the logarithmic transformation, it is not.
Here it can be seen that with a change to the skew, the confidence interval now passes through zero whereas before it did not.
This passing through zero is interpreted as the independent variable being statistically indistinguishable from zero influence on the dependent variable.
train_0 = X['capital_loss']
logit_0 = sm.Logit(Y, train_0)
train_1 = X_log_transformed['capital_loss']
logit_1 = sm.Logit(Y, train_1)
# fit the model
result_0 = logit_0.fit(disp=0)
result_1 = logit_1.fit(disp=0)
# Results
print()
print("Original model")
print(result_0.summary2())
print()
print("Transformed model")
print(result_1.summary2())
These two terms, normalization and standardization, are frequently used interchangeably, but they serve two different scaling purposes.
Earlier, capital_gain and capital_loss were transformed logarithmically, reducing their skew, and affecting the model's predictive power (i.e. ability to discern the relationship between the dependent and independent variables).
Another method of influencing the model's predictive power is normalization of the numerical independent variables, after which each feature is treated on an equal scale by the model.
However, after scaling is applied, observing the data in its raw form will no longer have the same meaning as before.
Note the output from scaling: age is no longer 39 but instead 0.30137. This value is meaningful only in the context of the rest of the data and not on its own.
scaler = MinMaxScaler(feature_range=(0, 1)) # default=(0, 1)
numerical = ['age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
X_log_minmax = pd.DataFrame(data = X_log_transformed).copy()
X_log_minmax[numerical] = scaler.fit_transform(X_log_transformed[numerical])
print("Original Data")
display(X.head(1))
# Show an example of a record with scaling applied
print("=" * 86)
print("Scaled Data")
display(X_log_minmax.head(1))
# Preserve final X transformation:
X_trans = X_log_minmax
fig = make_subplots(rows=4, cols=1)
fig.update_layout(height=800, width=850,
title_text="Comparison of Distributions of Continuous Census Data Features",
showlegend=False)
fig.add_trace(
go.Histogram(x=X_log_transformed['capital_loss'], nbinsx=25,
name='Log of Capital-Loss'),
row=1, col=1)
fig.add_trace(
go.Histogram(x=X_log_minmax['capital_loss'], nbinsx=25,
name='Normalized Capital-Loss'),
row=2, col=1)
fig.add_trace(
go.Histogram(x=X_log_transformed['capital_gain'], nbinsx=25,
name='Log of Capital-Gain'),
row=3, col=1)
fig.add_trace(
go.Histogram(x=X_log_minmax['capital_gain'], nbinsx=25,
name='Normalized Capital-Gain'),
row=4, col=1)
fig.update_xaxes(title_text="Log of Capital-Loss Feature Distribution", row=1, col=1)
fig.update_xaxes(title_text="Normalized Capital-Loss Feature Distribution", row=2, col=1)
fig.update_xaxes(title_text="Log of Capital-Gain Feature Distribution", row=3, col=1)
fig.update_xaxes(title_text="Normalized Capital-Gain Feature Distribution", row=4, col=1)
for i in range(1,5):
fig.update_yaxes(title_text="Number of Records", range=[0, 2000],
patch = dict(
tickmode = 'array',
tickvals = [0, 500, 1000, 1500, 2000],
ticktext = [0, 500, 1000, 1500, ">2000"]),
row=i, col=1)
fig.show()
After one-hot encoding, all categorical variables have been converted into numerical features, and the continuous features were normalized above (i.e. scaled between 0 and 1).
Next, for training a machine learning model, it is necessary to split the data into segments. One segment will be used for training the model, the training set, and the other will be used for testing the model, the testing set.
A common method of splitting is to segment based on a proportion of the data. An 80:20 split is typical for training:testing.
sklearn provides a function that works well for this, sklearn.model_selection.train_test_split. Essentially, it randomly assigns a portion of the data to a training set and the remainder to a testing set.
random_state: By setting a seed, option random_state, we can ensure the random splitting is the same for our model. This is necessary for evaluating the effectiveness of the model. Otherwise, we would be training and testing a model with the same proportional split (if we kept that static), but with different observations of the data.
test_size: This setting represents the proportion of the data to be held out for testing. Generally, this is the complement of the training size (train_size = 1 - test_size). For example, if test_size is 0.2, the train_size is 0.8.
stratify: Preserves the proportion of the label classes in the split data. As an example, let 1 and 0 indicate the positive and negative cases of a label, respectively. It is possible that only positive or only negative classes end up in either the training or testing set (e.g. $\forall y \in Y_{train}, y = 1$). To avoid this worst-case scenario, stratify will preserve the ratio of positive to negative classes in both the training and testing sets.
Here the data is split 80:20 with a seed set of 0 and the distribution of the label's classes preserved:
X_train, X_test, y_train, y_test = train_test_split(X_trans, Y, random_state=0, test_size=0.2, stratify=Y)
original_ratio = round(Y.value_counts()[1] / Y.value_counts()[0],2)
train_ratio = round(y_train.value_counts()[1] / y_train.value_counts()[0], 2)
test_ratio = round(y_test.value_counts()[1] / y_test.value_counts()[0], 2)
print('Original ratio of positive-to-negative classes: {}'.format(original_ratio))
print('Training ratio of positive-to-negative classes: {}'.format(train_ratio))
print('Testing ratio of positive-to-negative classes: {}'.format(test_ratio))
3.1 Accuracy
3.2 Precision
3.3 Recall
3.4 F$\beta$-Score
In terms of income as a predictor for donating, CharityML has stated they will most likely receive a donation from individuals whose income is in excess of 50,000/yr.
CharityML has limited funds to reach out to potential donors. Misclassifying a person as making more than 50,000/yr is costly for CharityML. It is more important that a person the model predicts as making more than 50,000/yr actually does so (a true positive) than to mistakenly predict that they do when they do not (a false positive).
Accuracy is a measure of the correctly predicted data points to total amount of data points:
$$Accuracy=\frac{\sum Correctly\ Classified\ Points}{\sum All\ Points}=\frac{\sum True\ Positives + \sum True\ Negatives}{\sum Observations}$$A Confusion Matrix demonstrates what a true/false positive/negative is:
|   | Predict 1 | Predict 0 |
|---|---|---|
| True 1 | True Positive | False Negative |
| True 0 | False Positive | True Negative |
These errors are sometimes referred to by their type:
|   | Predict 1 | Predict 0 |
|---|---|---|
| True 1 | True Positive | Type 2 Error |
| True 0 | Type 1 Error | True Negative |
For this analysis, we want to avoid false positives or type 1 errors. Put differently, we prefer false negatives to false positives.
A model that meets that criteria, $False\ Negative \succ False\ Positive$, is known as preferring precision over recall, or is a high precision model.
Humorously, and perhaps more memorably, these type errors can be demonstrated as such:

Precision is a measure of the amount of correctly predicted positive class to the amount of positive class predictions (correct as well as incorrect predictions of positive class):
$$Precision = \frac{\sum True\ Positives}{\sum True\ Positives + \sum False\ Positives}$$A model which avoids false positives would have a high precision value, or score. It may also be skewed toward false negatives.
Recall, sometimes referred to as a model's sensitivity, is a measure of the correctly predicted positive classes to the actual amount of positive classes (true positives and false negatives are both actually positive classes):
$$Recall = \frac{\sum True\ Positives}{\sum Actual\ Positives} = \frac{\sum True\ Positives}{\sum True\ Positives + \sum False\ Negatives}$$A model which avoids false negatives would have a high recall value, or score. It may also be skewed toward false positives.
An F-$\beta$ Score is a method of scoring a model both on precision and recall.
Where $\beta \in [0,\infty)$:
$$F_{\beta} = \left(1+\beta^{2}\right) \cdot \frac{Precision\ \cdot Recall}{\beta^{2} \cdot Precision + Recall}$$When $\beta = 0$, we get precision: $$F_{\beta=0} = \left(1+0^{2}\right) \cdot \frac{Precision\ \cdot Recall}{0^{2} \cdot Precision + Recall} = \left(1\right) \cdot \frac{Precision\ \cdot Recall}{Recall} = Precision$$
When $\beta = 1$, we get the harmonic mean of precision and recall:
$$F_{\beta=1} = \left(1+1^{2}\right) \cdot \frac{Precision\ \cdot Recall}{1^{2} \cdot Precision + Recall} = \left(2\right) \cdot \frac{Precision\ \cdot Recall}{Precision + Recall}$$... and when $\beta > 1$, we get something closer to recall:
$$F_{\beta \rightarrow \infty} = \left(1+\beta^{2}\right) \cdot \frac{Precision\ \cdot Recall}{\beta^{2} \cdot Precision + Recall} = \frac{Precision\ \cdot Recall}{\frac{\beta^{2}}{1+\beta^{2}} \cdot Precision + \frac{1}{1+ \beta^{2}} \cdot Recall}$$As $\beta \rightarrow \infty$: $$\frac{Precision\ \cdot Recall}{\frac{\beta^{2}}{1+\beta^{2}} \cdot Precision + \frac{1}{1+ \beta^{2}} \cdot Recall} \rightarrow \frac{Precision \cdot Recall}{1 \cdot Precision + 0 \cdot Recall} = \frac{Precision}{Precision} \cdot Recall = Recall$$
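A quick numeric check of those limiting cases (the labels below are made up purely for illustration; fbeta_score is the same scorer used for the models later):
# Illustrative labels giving precision = 2/3 and recall = 1/2
y_true_demo = [1, 1, 1, 1, 0, 0, 0, 0]
y_pred_demo = [1, 1, 0, 0, 1, 0, 0, 0]
for b in [0.01, 0.5, 1, 2, 100]:
    print("beta = {:>6}: F = {:.3f}".format(b, fbeta_score(y_true_demo, y_pred_demo, beta=b)))
# Small beta approaches precision (~0.667), beta = 1 is the harmonic mean, large beta approaches recall (0.5)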
4.1 Selection
4.2.1 Application
4.3 Model Application Pipeline
4.4.1 Application
4.4.2 Tuning
4.5 Random Forest
4.5.1 Application
4.5.2 Tuning
4.6 Ada Boost
4.6.1 Application
4.6.2 Tuning
4.7 Comparison
4.7.1 Feature Importance
4.7.2 Selection
Four classifiers are considered in this section: a Naive Bayes benchmark, Logistic Regression, Random Forest, and AdaBoost. Each candidate is applied through the same pipeline below, and the ensemble models are further tuned with grid search, so that accuracy and F-0.5 scores can be compared directly.
The Naive Bayes Classifier will be used as a benchmark model for this work.
Bayes' Theorem is as such:
$$P\left(A|B\right) = \frac{P\left(B|A\right) \cdot P\left(A\right)}{P\left(B\right)}$$It is considered naive as it assumes each feature is independent of one another.
Bayes' Theorem calculates the probability of an outcome (e.g. whether an individual receives income exceeding 50k/yr) based on the joint probability distributions of certain other events (e.g. any factors we include in the model).
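As a quick worked example with made-up numbers (purely illustrative, not estimates from the census data): if $P\left(>50k\right) = 0.25$, $P\left(Bachelors\ |\ >50k\right) = 0.4$, and $P\left(Bachelors\right) = 0.2$, then
$$P\left(>50k\ |\ Bachelors\right) = \frac{P\left(Bachelors\ |\ >50k\right) \cdot P\left(>50k\right)}{P\left(Bachelors\right)} = \frac{0.4 \cdot 0.25}{0.2} = 0.5$$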
As an example, I propose a model that always predicts an individual makes more than 50k/yr. This model has no false negatives; it has perfect recall (recall = 1).
Note: The purpose of generating a naive predictor is simply to show what a base model without any intelligence would look like. When there is no benchmark model set, getting a result better than random choice is a similar starting point.
Since this model always predicts a 1:
- True Positives (predicting 1 when 1 is true): equal to the sum of the label
- False Positives (predicting 1 when 0 is true): all remaining records
- True Negatives (predicting 0 when 0 is true): 0, as no 0s are ever predicted
- False Negatives (predicting 0 when 1 is true): 0, as no 0s are ever predicted

Note: I set $\beta = \frac{1}{2}$ as I want to penalize false positives, which are costly for CharityML. Recall the implications of setting the values of $\beta$ from before.
TP = np.sum(Y)
TN = 0
FP = len(Y) - TP
FN = 0
Beta = 1/2
accuracy = (TP + TN) / len(Y)
recall = TP / (TP + FN)
precision = TP / (TP + FP)
fscore = (1+Beta ** 2) * (precision * recall)/(((Beta ** 2) * precision) + recall)
print("Naive Predictor - Accuracy score: {:.4f}, F-score: {:.4f}".format(accuracy, fscore))
It can be useful to establish a routine for aspects related to modeling. This allows for standard comparison of outcomes generated from the same process.
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
"""
Pipeline to train, predict, and score algorithms
:param learner: the learning algorithm to be trained and predicted on
:param sample_size: the size of samples (number) to be drawn from training set
:param X_train: features training set
:param y_train: income training set
:param X_test: features testing set
:param y_test: income testing set
:return results: f-0.5 score, 0.5 chosen for high precision, avoiding false positives
"""
results = {}
# Fitting
start = time() # Get start time
learner.fit(X_train[:sample_size], y_train[:sample_size]) # Train model
end = time() # Get end time
results['train_time'] = end - start # Calculate the training time
# Predicting
start = time() # Get start time
predictions_test = learner.predict(X_test)
predictions_train = learner.predict(X_train[:300])
end = time() # Get end time
results['pred_time'] = end - start # Calculate the total prediction time
# Scoring
results['acc_train'] = accuracy_score(y_train[:300], predictions_train) # Training accuracy
results['acc_test'] = accuracy_score(y_test, predictions_test) # Testing accuracy
results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5) # Training F-0.5 score
results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5) # Testing F-0.5 score
# User feedback
print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
return results
def trainer(classifer):
"""
Function to train each selected model in a routine fashion for comparison
:param classifier: classification model from Scikit-Learn to be trained
:return step_results: outcome of training on the data and defined parameters
"""
step_results = {}
samples_100 = int(len(X_train))
samples_10 = int(len(X_train) / 10)
samples_1 = int(len(X_train) / 100)
clf_name = classifer.__class__.__name__
step_results[clf_name] = {}
for i,sample in enumerate([samples_1, samples_10, samples_100]):
step_results[clf_name][i] = train_predict(classifer, sample, X_train, y_train, X_test, y_test)
return step_results
def grid_tuner(classifier, parameters):
"""
Function to tune with grid search in a routine fashion
:param classifier: classification model from Scikit-Learn to be trained
:param parameters: dictionary of hyperparameter names and candidate values for GridSearchCV
:return outcomes: accuracy and F-0.5 scores for the initial and tuned models, plus the best parameters found
"""
scorer = make_scorer(fbeta_score, beta=0.5)
grid_obj = GridSearchCV(estimator=classifier, param_grid=parameters, scoring=scorer)
grid_fit = grid_obj.fit(X_train, y_train)
best_classifier = grid_fit.best_estimator_
predictions = (classifier.fit(X_train, y_train)).predict(X_test)
best_predictions = best_classifier.predict(X_test)
outcomes = {'test_acc': accuracy_score(y_test, predictions),
'f_test': fbeta_score(y_test, predictions, beta = 0.5),
'tuned_acc': accuracy_score(y_test, best_predictions),
'f_tuned': fbeta_score(y_test, best_predictions, beta = 0.5),
'best_param': grid_fit.best_params_}
print("Initial Model:")
print("\t Accuracy: {:.4f}".format(outcomes['test_acc']))
print("\t F0.5-Score: {:.4f}".format(outcomes['f_test']))
print("Tuned Model:")
print("\t Accuracy: {:.4f}".format(outcomes['tuned_acc']))
print("\t F0.5-Score: {:.4f}".format(outcomes['f_tuned']))
print("Best Parameters:")
print("\t {}".format(outcomes['best_param']))
return outcomes
Logistic regression models the probability that the dependent variable takes the positive class, given the independent variables. Its output is bounded between 0 and 1 (i.e. $ h_{\theta}\left(X\right) \in \left[0,1\right]$).
$$ h_{\theta}\left(X\right) = P\left(Y=1 | X\right) = \frac{1}{1+e^{-\left(\theta^{T}X\right)}}, \qquad P\left(Y=0 | X\right) = 1 - \frac{1}{1+e^{-\left(\theta^{T}X\right)}} $$The derivative of this sigmoid takes the convenient form used when fitting the model: $$ h'\left(x\right) = \left(h\left(x\right)\right) \cdot \left(1 - h\left(x\right)\right)$$
Deriving the Sigmoid's Derivative: How does $ h'\left(x\right) = \left(h\left(x\right)\right) \cdot \left(1 - h\left(x\right)\right)$ fall out of $h\left(x\right) = \frac{1}{1+e^{-x}}$ ?
The following math involves a knowledge of some single variable differential calculus, $y = x^{n} \rightarrow \frac{\Delta y}{\Delta x} = n\cdot x^{n-1}$, and the chain rule, $\frac{\Delta}{\Delta x}f\left(g\left(x\right)\right)= f'\left(g\left(x\right)\right) \cdot g'\left(x\right)$:
$$h\left(x\right) = \frac{1}{1+e^{-x}}$$$$\frac{\Delta h\left(x\right)}{\Delta x} = \frac{\Delta}{\Delta x}\left(1+e^{-x}\right)^{-1}$$$$\because \frac{\Delta}{\Delta x}x^{n} = n\cdot x^{n-1} \wedge \frac{\Delta}{\Delta x}f\left(g\left(x\right)\right)= f'\left(g\left(x\right)\right) \cdot g'\left(x\right) \implies$$$$\frac{\Delta}{\Delta x}\left(1+e^{-x}\right)^{-1} = -\left(1+e^{-x}\right)^{-2}\left(-e^{-x}\right) = \frac{e^{-x}}{\left(1+e^{-x}\right)^{2}} = \frac{e^{-x}}{\left(1+e^{-x}\right)} \cdot \frac{1}{\left(1+e^{-x}\right)}$$$$= \frac{\left(1+e^{-x}\right)-1}{\left(1+e^{-x}\right)} \cdot \frac{1}{\left(1+e^{-x}\right)} = \left(\frac{1+e^{-x}}{1+e^{-x}} - \frac{1}{1+e^{-x}}\right)\cdot \frac{1}{1+e^{-x}}$$$$= \left(1-\frac{1}{1+e^{-x}}\right) \cdot \frac{1}{1+e^{-x}} = \left(1-h\left(x\right)\right) \cdot h\left(x\right) \square$$
%%time
log_reg_0 = trainer(classifer=LogisticRegression(random_state=0))
print()
print("Logistic Regression: Default")
display(pd.DataFrame.from_dict(log_reg_0['LogisticRegression'], orient='index'))
%%time
log_reg_1 = trainer(classifer=LogisticRegression(penalty='l2', max_iter=500, random_state=0, solver='liblinear'))
print()
print("Logistic Regression: Mindful Parameters")
display(pd.DataFrame.from_dict(log_reg_1['LogisticRegression'], orient='index'))
%%time
lr_2 = LogisticRegressionCV(random_state=0, max_iter=500, penalty='l2', solver='liblinear')
log_reg_2 = trainer(classifer=lr_2)
print()
print("Logistic Regression with CV")
display(pd.DataFrame.from_dict(log_reg_2['LogisticRegressionCV'], orient='index'))
Try a model without the variables that were found to have strong relationships with one another and observe the effect on the model.
# Drop the strongly associated factors identified earlier: marital_status, relationship, occupation
tune_data = data.drop(columns=['marital_status', 'relationship', 'occupation'])
tune_data.columns
A Random Forest is an ensemble of decision trees. Each tree is fit on a bootstrapped sample of the training data with only a random subset of features considered at each split, and the forest predicts by majority vote across its trees, which reduces the variance of any single deep tree.
%%time
rand_for_0 = trainer(classifer=RandomForestClassifier(random_state=0))
print()
print("Random Forest: Default")
display(pd.DataFrame.from_dict(rand_for_0['RandomForestClassifier'], orient='index'))
rand_for_1 = trainer(classifer=RandomForestClassifier(n_estimators=500, min_samples_leaf=25,random_state=0))
print()
print("Random Forest: Tuned")
display(pd.DataFrame.from_dict(rand_for_1['RandomForestClassifier'], orient='index'))
%%time
parameters = {'n_estimators': [100, 200, 500],
"min_samples_leaf": [5, 10, 20]}
rf_tune_0 = grid_tuner(classifier=RandomForestClassifier(random_state=0), parameters=parameters)
%%time
parameters = {'n_estimators': [150, 200, 250],
"min_samples_leaf": [3, 5, 7]}
rf_tune_1 = grid_tuner(classifier=RandomForestClassifier(random_state=0), parameters=parameters)
%%time
parameters = {'n_estimators': [225, 250, 275],
"min_samples_leaf": [2, 3, 4]}
rf_tune_2 = grid_tuner(classifier=RandomForestClassifier(random_state=0), parameters=parameters)
rand_for_2 = trainer(classifer=RandomForestClassifier(random_state=0, n_estimators=250, min_samples_leaf= 2))
print()
print("Random Forest: Gridded")
display(pd.DataFrame.from_dict(rand_for_2['RandomForestClassifier'], orient='index'))
AdaBoost (Adaptive Boosting) fits a sequence of weak learners, shallow decision trees by default. After each round, the training observations are re-weighted so that later learners focus on the records the earlier ones misclassified, and the final prediction is a weighted vote across all learners.
%%time
abc_0 = trainer(classifer=AdaBoostClassifier(random_state=0))
print()
print("Ada Boost Classifier: Default")
display(pd.DataFrame.from_dict(abc_0['AdaBoostClassifier'], orient='index'))
%%time
parameters = {'n_estimators': [200, 400],
'learning_rate': [1, 1.5]}
abc_tune_0 = grid_tuner(classifier=AdaBoostClassifier(random_state=0), parameters=parameters)
%%time
parameters = {'n_estimators': [400, 800, 1000],
"learning_rate": [1.4, 1.6, 1.8]}
abc_tune_1 = grid_tuner(classifier=AdaBoostClassifier(random_state=0), parameters=parameters)
%%time
parameters = {'n_estimators': [700, 800, 900],
"learning_rate": [1.5, 1.6, 1.7]}
abc_tune_2 = grid_tuner(classifier=AdaBoostClassifier(random_state=0), parameters=parameters)
%%time
parameters = {'n_estimators': [750, 800, 850],
"learning_rate": [1.55, 1.6, 1.65]}
abc_tune_3 = grid_tuner(classifier=AdaBoostClassifier(random_state=0), parameters=parameters)
%%time
rf_tuned = RandomForestClassifier(n_estimators=250, min_samples_leaf=2,random_state=0)
abc_rf_tuned = AdaBoostClassifier(random_state=0, base_estimator=rf_tuned, n_estimators=400, learning_rate=1.5) # New name so the grid-search result abc_tune_3 above is not overwritten
abc_trained = trainer(classifer=abc_rf_tuned)
print()
print("Ada Boost Classifier: Tuned with RF Tuned Classifier")
display(pd.DataFrame.from_dict(abc_trained['AdaBoostClassifier'], orient='index'))
%%time
rf_tuned_1 = RandomForestClassifier(n_estimators=250, min_samples_leaf=2,random_state=0)
abc_tune_4 = AdaBoostClassifier(random_state=0, base_estimator=rf_tuned_1, n_estimators=800, learning_rate=1.6)
abc_trained_1 = trainer(classifer=abc_tune_4)
print()
print("Ada Boost Classifier: Tuned with RF Tuned Classifier")
display(pd.DataFrame.from_dict(abc_trained_1['AdaBoostClassifier'], orient='index'))
%%time
abc_tune_5 = AdaBoostClassifier(random_state=0, n_estimators=400, learning_rate=1.5)
abc_trained_2 = trainer(classifer=abc_tune_5)
print()
print("Ada Boost Classifier: Tuned")
display(pd.DataFrame.from_dict(abc_trained_2['AdaBoostClassifier'], orient='index'))
# # Full Page - Code
!jupyter nbconvert donor_class.ipynb --output class_code --reveal-prefix=reveal.js --SlidesExporter.reveal_theme=serif --SlidesExporter.reveal_scroll=True --SlidesExporter.reveal_transition=none
# # Full Page - No Code
!jupyter nbconvert donor_class.ipynb --output class_no_code --reveal-prefix=reveal.js --SlidesExporter.reveal_theme=serif --SlidesExporter.reveal_scroll=True --SlidesExporter.reveal_transition=none --TemplateExporter.exclude_input=True
# # Slides - No Code
!jupyter nbconvert --to slides donor_class.ipynb --output class_slides --TemplateExporter.exclude_input=True --SlidesExporter.reveal_transition=none --SlidesExporter.reveal_scroll=True